# Import all packages that we going to use
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px
%matplotlib inline
# Load the dataset and print the first rows to check the dataset
df = pd.read_csv('201902-fordgobike-tripdata.csv')
df.head()
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.1450 | 2019-03-01 08:01:55.9750 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No |
| 1 | 42521 | 2019-02-28 18:53:21.7890 | 2019-03-01 06:42:03.0560 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 2535 | Customer | NaN | NaN | No |
| 2 | 61854 | 2019-02-28 12:13:13.2180 | 2019-03-01 05:24:08.1460 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 2019-02-28 17:54:26.0100 | 2019-03-01 04:02:36.8420 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 2019-02-28 23:54:18.5490 | 2019-03-01 00:20:44.0740 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | Yes |
# Get an overview of the dataset and datatypes of rach column and non-null values
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 183412 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183412 non-null int64 1 start_time 183412 non-null object 2 end_time 183412 non-null object 3 start_station_id 183215 non-null float64 4 start_station_name 183215 non-null object 5 start_station_latitude 183412 non-null float64 6 start_station_longitude 183412 non-null float64 7 end_station_id 183215 non-null float64 8 end_station_name 183215 non-null object 9 end_station_latitude 183412 non-null float64 10 end_station_longitude 183412 non-null float64 11 bike_id 183412 non-null int64 12 user_type 183412 non-null object 13 member_birth_year 175147 non-null float64 14 member_gender 175147 non-null object 15 bike_share_for_all_trip 183412 non-null object dtypes: float64(7), int64(2), object(7) memory usage: 22.4+ MB
# Get statistical overview of all the dataset
df.describe(include = 'all')
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 183412.000000 | 183412 | 183412 | 183215.000000 | 183215 | 183412.000000 | 183412.000000 | 183215.000000 | 183215 | 183412.000000 | 183412.000000 | 183412.000000 | 183412 | 175147.000000 | 175147 | 183412 |
| unique | NaN | 183401 | 183397 | NaN | 329 | NaN | NaN | NaN | 329 | NaN | NaN | NaN | 2 | NaN | 3 | 2 |
| top | NaN | 2019-02-25 08:52:07.5820 | 2019-02-11 08:03:22.6020 | NaN | Market St at 10th St | NaN | NaN | NaN | San Francisco Caltrain Station 2 (Townsend St... | NaN | NaN | NaN | Subscriber | NaN | Male | No |
| freq | NaN | 2 | 2 | NaN | 3904 | NaN | NaN | NaN | 4857 | NaN | NaN | NaN | 163544 | NaN | 130651 | 166053 |
| mean | 726.078435 | NaN | NaN | 138.590427 | NaN | 37.771223 | -122.352664 | 136.249123 | NaN | 37.771427 | -122.352250 | 4472.906375 | NaN | 1984.806437 | NaN | NaN |
| std | 1794.389780 | NaN | NaN | 111.778864 | NaN | 0.099581 | 0.117097 | 111.515131 | NaN | 0.099490 | 0.116673 | 1664.383394 | NaN | 10.116689 | NaN | NaN |
| min | 61.000000 | NaN | NaN | 3.000000 | NaN | 37.317298 | -122.453704 | 3.000000 | NaN | 37.317298 | -122.453704 | 11.000000 | NaN | 1878.000000 | NaN | NaN |
| 25% | 325.000000 | NaN | NaN | 47.000000 | NaN | 37.770083 | -122.412408 | 44.000000 | NaN | 37.770407 | -122.411726 | 3777.000000 | NaN | 1980.000000 | NaN | NaN |
| 50% | 514.000000 | NaN | NaN | 104.000000 | NaN | 37.780760 | -122.398285 | 100.000000 | NaN | 37.781010 | -122.398279 | 4958.000000 | NaN | 1987.000000 | NaN | NaN |
| 75% | 796.000000 | NaN | NaN | 239.000000 | NaN | 37.797280 | -122.286533 | 235.000000 | NaN | 37.797320 | -122.288045 | 5502.000000 | NaN | 1992.000000 | NaN | NaN |
| max | 85444.000000 | NaN | NaN | 398.000000 | NaN | 37.880222 | -121.874119 | 398.000000 | NaN | 37.880222 | -121.874119 | 6645.000000 | NaN | 2001.000000 | NaN | NaN |
# Checking the values of user_type column
df.user_type.value_counts()
Subscriber 163544 Customer 19868 Name: user_type, dtype: int64
# # Checking the values of member_gender column
df.member_gender.value_counts()
Male 130651 Female 40844 Other 3652 Name: member_gender, dtype: int64
# Drop all NaN values from the table
df = df.dropna()
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174952 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null object 2 end_time 174952 non-null object 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null float64 6 start_station_longitude 174952 non-null float64 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null float64 10 end_station_longitude 174952 non-null float64 11 bike_id 174952 non-null int64 12 user_type 174952 non-null object 13 member_birth_year 174952 non-null float64 14 member_gender 174952 non-null object 15 bike_share_for_all_trip 174952 non-null object dtypes: float64(7), int64(2), object(7) memory usage: 22.7+ MB
# Change the data type of 'user_type' and 'member_gender' columns into category data type
df.user_type = df.user_type.astype('category')
df.member_gender = df.member_gender.astype('category')
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174952 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null object 2 end_time 174952 non-null object 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null float64 6 start_station_longitude 174952 non-null float64 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null float64 10 end_station_longitude 174952 non-null float64 11 bike_id 174952 non-null int64 12 user_type 174952 non-null category 13 member_birth_year 174952 non-null float64 14 member_gender 174952 non-null category 15 bike_share_for_all_trip 174952 non-null object dtypes: category(2), float64(7), int64(2), object(5) memory usage: 20.4+ MB
# Change the data type of 'start_time' and 'end_time' columns into datetime data type
df.start_time = pd.to_datetime(df.start_time)
df.end_time = pd.to_datetime(df.end_time)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174952 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null datetime64[ns] 2 end_time 174952 non-null datetime64[ns] 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null float64 6 start_station_longitude 174952 non-null float64 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null float64 10 end_station_longitude 174952 non-null float64 11 bike_id 174952 non-null int64 12 user_type 174952 non-null category 13 member_birth_year 174952 non-null float64 14 member_gender 174952 non-null category 15 bike_share_for_all_trip 174952 non-null object dtypes: category(2), datetime64[ns](2), float64(7), int64(2), object(3) memory usage: 20.4+ MB
# change the data type of 'bike_share_for_all_trip' into boolean data type
df.bike_share_for_all_trip = (df.bike_share_for_all_trip == 'Yes')
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174952 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null datetime64[ns] 2 end_time 174952 non-null datetime64[ns] 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null float64 6 start_station_longitude 174952 non-null float64 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null float64 10 end_station_longitude 174952 non-null float64 11 bike_id 174952 non-null int64 12 user_type 174952 non-null category 13 member_birth_year 174952 non-null float64 14 member_gender 174952 non-null category 15 bike_share_for_all_trip 174952 non-null bool dtypes: bool(1), category(2), datetime64[ns](2), float64(7), int64(2), object(2) memory usage: 19.2+ MB
# Change the data type of ['start_station_latitude', 'start_station_longitude', 'end_station_latitude', 'end_station_longitude', 'bike_id'] from float into string
str_columns = ['start_station_latitude', 'start_station_longitude', 'end_station_latitude', 'end_station_longitude', 'bike_id']
for x in str_columns:
df[x] = df[x].astype(str)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174952 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null datetime64[ns] 2 end_time 174952 non-null datetime64[ns] 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null object 6 start_station_longitude 174952 non-null object 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null object 10 end_station_longitude 174952 non-null object 11 bike_id 174952 non-null object 12 user_type 174952 non-null category 13 member_birth_year 174952 non-null float64 14 member_gender 174952 non-null category 15 bike_share_for_all_trip 174952 non-null bool dtypes: bool(1), category(2), datetime64[ns](2), float64(3), int64(1), object(7) memory usage: 19.2+ MB
# Make new columns for start time in 24H's, the start day of the week, and start date only, and members age
df['start_hour_of_the_day'] = df.start_time.dt.strftime('%H')
df['start_day_of_the_week'] = df.start_time.dt.strftime('%A')
df['start_date'] = df.start_time.dt.strftime('%Y-%m-%d')
df['member_age'] = 2022 - df.member_birth_year
df.head()
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | start_hour_of_the_day | start_day_of_the_week | start_date | member_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.7896254 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | False | 17 | Thursday | 2019-02-28 | 38.0 |
| 2 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.7693053 | -122.4268256 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.78637526861584 | -122.40490436553954 | 5905 | Customer | 1972.0 | Male | False | 12 | Thursday | 2019-02-28 | 50.0 |
| 3 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.77483629413345 | -122.44654566049576 | 70.0 | Central Ave at Fell St | 37.77331087889723 | -122.44429260492323 | 6638 | Subscriber | 1989.0 | Other | False | 17 | Thursday | 2019-02-28 | 33.0 |
| 4 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.8045623549303 | -122.27173805236816 | 222.0 | 10th Ave at E 15th St | 37.7927143 | -122.2487796 | 4898 | Subscriber | 1974.0 | Male | True | 23 | Thursday | 2019-02-28 | 48.0 |
| 5 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.7704074 | -122.3911984 | 323.0 | Broadway at Kearny | 37.79801364395978 | -122.40595042705534 | 5200 | Subscriber | 1959.0 | Male | False | 23 | Thursday | 2019-02-28 | 63.0 |
df.member_birth_year = df.member_birth_year.astype('int')
df.member_age = df.member_age.astype('int')
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174952 entries, 0 to 183411 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null datetime64[ns] 2 end_time 174952 non-null datetime64[ns] 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null object 6 start_station_longitude 174952 non-null object 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null object 10 end_station_longitude 174952 non-null object 11 bike_id 174952 non-null object 12 user_type 174952 non-null category 13 member_birth_year 174952 non-null int32 14 member_gender 174952 non-null category 15 bike_share_for_all_trip 174952 non-null bool 16 start_hour_of_the_day 174952 non-null object 17 start_day_of_the_week 174952 non-null object 18 start_date 174952 non-null object 19 member_age 174952 non-null int32 dtypes: bool(1), category(2), datetime64[ns](2), float64(2), int32(2), int64(1), object(10) memory usage: 23.2+ MB
# Make a new column with duration og the trip in mins instead of seconds
df['duration_min'] = df.duration_sec / 60
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174952 entries, 0 to 183411 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null datetime64[ns] 2 end_time 174952 non-null datetime64[ns] 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null object 6 start_station_longitude 174952 non-null object 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null object 10 end_station_longitude 174952 non-null object 11 bike_id 174952 non-null object 12 user_type 174952 non-null category 13 member_birth_year 174952 non-null int32 14 member_gender 174952 non-null category 15 bike_share_for_all_trip 174952 non-null bool 16 start_hour_of_the_day 174952 non-null object 17 start_day_of_the_week 174952 non-null object 18 start_date 174952 non-null object 19 member_age 174952 non-null int32 20 duration_min 174952 non-null float64 dtypes: bool(1), category(2), datetime64[ns](2), float64(3), int32(2), int64(1), object(10) memory usage: 24.5+ MB
# Get an overview statistics of the new table
df.describe(include = 'all')
C:\Users\aswan\AppData\Local\Temp\ipykernel_12832\2418990127.py:2: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now. df.describe(include = 'all') C:\Users\aswan\AppData\Local\Temp\ipykernel_12832\2418990127.py:2: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now. df.describe(include = 'all')
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | start_hour_of_the_day | start_day_of_the_week | start_date | member_age | duration_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 174952.000000 | 174952 | 174952 | 174952.000000 | 174952 | 174952 | 174952 | 174952.000000 | 174952 | 174952 | ... | 174952 | 174952 | 174952.000000 | 174952 | 174952 | 174952 | 174952 | 174952 | 174952.000000 | 174952.000000 |
| unique | NaN | 174941 | 174939 | NaN | 329 | 329 | 329 | NaN | 329 | 329 | ... | 4607 | 2 | NaN | 3 | 2 | 24 | 7 | 28 | NaN | NaN |
| top | NaN | 2019-02-11 17:05:07.840000 | 2019-02-28 17:40:37.328000 | NaN | Market St at 10th St | 37.776619 | -122.417385 | NaN | San Francisco Caltrain Station 2 (Townsend St... | 37.7766392 | ... | 4794 | Subscriber | NaN | Male | False | 17 | Thursday | 2019-02-28 | NaN | NaN |
| freq | NaN | 2 | 2 | NaN | 3649 | 3649 | 3649 | NaN | 4624 | 4624 | ... | 179 | 158386 | NaN | 130500 | 157606 | 20904 | 33712 | 9448 | NaN | NaN |
| first | NaN | 2019-02-01 00:00:20.636000 | 2019-02-01 00:04:52.058000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| last | NaN | 2019-02-28 23:59:18.548000 | 2019-03-01 08:01:55.975000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | 704.002744 | NaN | NaN | 139.002126 | NaN | NaN | NaN | 136.604486 | NaN | NaN | ... | NaN | NaN | 1984.803135 | NaN | NaN | NaN | NaN | NaN | 37.196865 | 11.733379 |
| std | 1642.204905 | NaN | NaN | 111.648819 | NaN | NaN | NaN | 111.335635 | NaN | NaN | ... | NaN | NaN | 10.118731 | NaN | NaN | NaN | NaN | NaN | 10.118731 | 27.370082 |
| min | 61.000000 | NaN | NaN | 3.000000 | NaN | NaN | NaN | 3.000000 | NaN | NaN | ... | NaN | NaN | 1878.000000 | NaN | NaN | NaN | NaN | NaN | 21.000000 | 1.016667 |
| 25% | 323.000000 | NaN | NaN | 47.000000 | NaN | NaN | NaN | 44.000000 | NaN | NaN | ... | NaN | NaN | 1980.000000 | NaN | NaN | NaN | NaN | NaN | 30.000000 | 5.383333 |
| 50% | 510.000000 | NaN | NaN | 104.000000 | NaN | NaN | NaN | 101.000000 | NaN | NaN | ... | NaN | NaN | 1987.000000 | NaN | NaN | NaN | NaN | NaN | 35.000000 | 8.500000 |
| 75% | 789.000000 | NaN | NaN | 239.000000 | NaN | NaN | NaN | 238.000000 | NaN | NaN | ... | NaN | NaN | 1992.000000 | NaN | NaN | NaN | NaN | NaN | 42.000000 | 13.150000 |
| max | 84548.000000 | NaN | NaN | 398.000000 | NaN | NaN | NaN | 398.000000 | NaN | NaN | ... | NaN | NaN | 2001.000000 | NaN | NaN | NaN | NaN | NaN | 144.000000 | 1409.133333 |
13 rows × 21 columns
# Get the first 10 rows of the data for better understanding of the data
df.head(10)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | start_hour_of_the_day | start_day_of_the_week | start_date | member_age | duration_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.7896254 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | ... | 4902 | Customer | 1984 | Male | False | 17 | Thursday | 2019-02-28 | 38 | 869.750000 |
| 2 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.7693053 | -122.4268256 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.78637526861584 | ... | 5905 | Customer | 1972 | Male | False | 12 | Thursday | 2019-02-28 | 50 | 1030.900000 |
| 3 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.77483629413345 | -122.44654566049576 | 70.0 | Central Ave at Fell St | 37.77331087889723 | ... | 6638 | Subscriber | 1989 | Other | False | 17 | Thursday | 2019-02-28 | 33 | 608.166667 |
| 4 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.8045623549303 | -122.27173805236816 | 222.0 | 10th Ave at E 15th St | 37.7927143 | ... | 4898 | Subscriber | 1974 | Male | True | 23 | Thursday | 2019-02-28 | 48 | 26.416667 |
| 5 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.7704074 | -122.3911984 | 323.0 | Broadway at Kearny | 37.79801364395978 | ... | 5200 | Subscriber | 1959 | Male | False | 23 | Thursday | 2019-02-28 | 63 | 29.883333 |
| 6 | 1147 | 2019-02-28 23:55:35.104 | 2019-03-01 00:14:42.588 | 300.0 | Palm St at Willow St | 37.3172979 | -121.884995 | 312.0 | San Jose Diridon Station | 37.329732 | ... | 3803 | Subscriber | 1983 | Female | False | 23 | Thursday | 2019-02-28 | 39 | 19.116667 |
| 7 | 1615 | 2019-02-28 23:41:06.766 | 2019-03-01 00:08:02.756 | 10.0 | Washington St at Kearny St | 37.79539293725452 | -122.4047702550888 | 127.0 | Valencia St at 21st St | 37.7567083 | ... | 6329 | Subscriber | 1989 | Male | False | 23 | Thursday | 2019-02-28 | 33 | 26.916667 |
| 8 | 1570 | 2019-02-28 23:41:48.790 | 2019-03-01 00:07:59.715 | 10.0 | Washington St at Kearny St | 37.79539293725452 | -122.4047702550888 | 127.0 | Valencia St at 21st St | 37.7567083 | ... | 6548 | Subscriber | 1988 | Other | False | 23 | Thursday | 2019-02-28 | 34 | 26.166667 |
| 9 | 1049 | 2019-02-28 23:49:47.699 | 2019-03-01 00:07:17.025 | 19.0 | Post St at Kearny St | 37.788975 | -122.403452 | 121.0 | Mission Playground | 37.7592103 | ... | 6488 | Subscriber | 1992 | Male | False | 23 | Thursday | 2019-02-28 | 30 | 17.483333 |
| 10 | 458 | 2019-02-28 23:57:57.211 | 2019-03-01 00:05:35.435 | 370.0 | Jones St at Post St | 37.7873267660624 | -122.41327822208405 | 43.0 | San Francisco Public Library (Grove St at Hyde... | 37.7787677 | ... | 5318 | Subscriber | 1996 | Female | True | 23 | Thursday | 2019-02-28 | 26 | 7.633333 |
10 rows × 21 columns
The dataset has 174952 rows and 21 columns features informations about goBike company rides during feburary month in 2019, information about ('duration_sec', 'start_time', 'end_time', 'start_station_id','start_station_name', 'start_station_latitude', 'start_station_longitude', 'end_station_id', 'end_station_name', 'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type', 'member_birth_year', 'member_gender', 'bike_share_for_all_trip', 'start_hour_of_the_day', 'start_day_of_the_week', 'start_date', 'member_age', 'duration_min'), columns are divided into 1 boolean, 2 category,2 datetime, 3 float,3 int, and 10 object
I'm interested the most in finding a relation between Subscriber and Customers and how to convert customers into subscribes, by finding the common habits of Subscribers
I think the finding a pattern between Subscriber habits and customers habits would help me
def plt_custom_attr(title):
plt.figure(figsize= [20, 10])
plt.title(title, fontsize=24)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
def plt_custom_attr_x_y(x_label, y_label):
plt.xlabel(x_label, fontsize = 18)
plt.ylabel(y_label, fontsize = 18)
I'll start by looking at the distribution of the main variables of interest
df.duration_min.max(), df.duration_min.min()
(1409.1333333333334, 1.0166666666666666)
duration_min_adj = df.duration_min[df.duration_min.between(df.duration_min.quantile(0.01), df.duration_min.quantile(0.99))]
duration_min_adj.max()
52.93333333333333
plt_custom_attr('Trips durations in minutes')
plt.hist(duration_min_adj, bins = 50)
plt_custom_attr_x_y('Durations in minutes','Count')
plt.xticks([0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]);
Most trip durations are between 0 and 10 minutes, trip duration increase at first till it reachs its peak at 4 mins then it start to decrease exponentially.
hours = ['00', '01', '02', '03' '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23']
plt_custom_attr('Trips start time of the day')
sns.countplot(data = df, x = 'start_hour_of_the_day', color = '#4169e1', order = hours)
plt_custom_attr_x_y('Hour of the day','Count');
There's two surges in start trip time, first one at 8 in the morning and second one is at 5 in the evening which alligns with the normal start and end working hours
weekday = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
plt_custom_attr('Trips per days of the week')
sns.countplot(data=df, x='start_day_of_the_week', color='#4169e1', order = weekday)
plt_custom_attr_x_y('Days of the week', 'Count');
Most busy days of the week are week days and there are drops in rides during weekend which could means that a lot of people using the bikes for going and coming from work
plt.figure(figsize= [20, 10])
plt_custom_attr('Trips per user type')
sns.countplot(data=df, x='user_type')
plt_custom_attr_x_y('User_type','Count');
<Figure size 1440x720 with 0 Axes>
The vast majority of rides made by users are by subscribers
plt_custom_attr('Trips per user gender')
sns.countplot(data = df, x = 'member_gender')
plt_custom_attr_x_y('Users gender','Count');
The majority of trip were made by male users 3 times more female useres
Most of the trips durations are between 1 and 10 minutes long which could mean that a lot of people are using bikes for short trips as mean of transportation.
Top start times for the trips are 8am and 5pm with average use in between those times and less rides at the night
Bike trips are more common during week days and much less during weekend
Subscribers are 8 times more than customers (but you can't tell number of users because there's no unique identfications for users)
Most of the trips were made by males than females
plt_custom_attr('Trips duration over days of the week')
sns.barplot(data = df, x='start_day_of_the_week', y='duration_min', order = weekday, color='#4169e1')
plt_custom_attr_x_y('Days of the week','Average ride duration');
The average ride duration during week days is lower than average ride duration during weekend
plt_custom_attr('Average age among different users groups')
sns.violinplot(data = df, x = 'user_type', y = 'member_age', inner='quart')
plt_custom_attr_x_y('User type', 'Member age');
df.member_age.max()
144
df = df[(np.abs(stats.zscore(df.member_age)) < 3)]
df.member_age.max()
67
plt_custom_attr('Average age among different users groups')
sns.violinplot(data = df, x = 'user_type', y = 'member_age', inner='quart')
plt_custom_attr_x_y('User type', 'Member age');
The average age of customers and subscribers are 35 but at 1st quantile subscribers are slighter higher than customers
plt_custom_attr('Trips start time of the day Among different users types')
sns.countplot(data = df, x='start_hour_of_the_day', hue='user_type', order = hours)
plt_custom_attr_x_y('Hour of the day', 'Count');
For subsribers there's two surges in start trip time, first one at 8 in the morning and second one is at 5 in the evening, but with customers there's small difference between those 2 times
plt_custom_attr('Average ride duration among differen members age')
sns.regplot(data = df, x = 'member_age', y = 'duration_min', marker = '.' , x_jitter=0.5, scatter_kws={'alpha':3/10})
plt_custom_attr_x_y('Member age', 'Count');
There's no significant difference between ride time among different age groups
plt_custom_attr('Count of users among different genders')
sns.countplot(data = df, hue = 'user_type', x = 'member_gender')
plt_custom_attr_x_y('User gender', 'Count')
plt.legend(fontsize = 20);
The majority of trip were made by male users, but the ratio between customer to subscriber in female users are higher
plt_custom_attr('Start stations locations with most rides')
sns.countplot(data = df, y= 'start_station_name', order=df.start_station_name.value_counts().iloc[:15].index, hue = 'user_type')
plt_custom_attr_x_y('Count', 'Start station name')
plt.legend(fontsize = 20);
plt.figure(figsize= [20, 15])
fig = px.box(df, x="start_day_of_the_week", y="duration_min", color="bike_share_for_all_trip")
fig.update_layout(title = "Trips duration among different days of the week",
yaxis_title = "Ride duration",
xaxis_title = "Day of the week")
fig.show();
<Figure size 1440x1080 with 0 Axes>
There's more ride sharing among short duration rides
plt_custom_attr('Average ride duration among different user types during week days')
sns.pointplot(data = df, x='start_day_of_the_week', y='duration_min', hue='user_type', dodge=0.3, linestyles="", order = weekday)
plt_custom_attr_x_y('Days of the week', 'Average ride duration')
plt.legend(fontsize = 20);
Average ride duration are higher during weekend than weekdays amonth customers and subscribers
plt_custom_attr('Age distrubtion among different user gender and user type')
sns.violinplot(data=df, x="member_gender", y="member_age", hue="user_type", split=True, inner="quart", linewidth=1)
plt_custom_attr_x_y('User gender', 'User age')
plt.legend(fontsize = 20);
plt_custom_attr('Ride duration among member age and user type')
sns.stripplot(data = df, x = 'member_age', y = 'duration_min', hue = 'user_type', marker = '.' , jitter = 0.5 , alpha = 0.3)
plt_custom_attr_x_y('Member age', 'Ride duration')
plt.legend(fontsize = 20);